TL;DR

We add cell type information in X and look at W.

Data

As a first pass, I will only focus on the cells that passed the QC filters (by the original authors) and that were part of the “core” clusters. I will color-code the cells by either known cell type or by inferred cluster (inferred in the original study).

Select cells, remove ERCC spike-ins, filter out the genes that do not have at least 10 counts in at least 10 cells.

data("allen")
allen_core <- allen[grep("^ERCC-", rownames(allen), invert = TRUE),
                    which(colData(allen)$Core.Type=="Core")]

filter <- apply(assay(allen_core)>10, 1, sum)>=10

Number of retained genes:

print(sum(filter))
## [1] 11545

To speed up the computations, I will focus on the top 1,000 most variable genes.

allen_core <- allen_core[filter,]
core <- assay(allen_core)

vars <- rowVars(log1p(core))
names(vars) <- rownames(core)
vars <- sort(vars, decreasing = TRUE)
core <- core[names(vars)[1:1000],]

First, let’s look at PCA (of the log counts) for reference.

par(mfrow=c(1, 2))
bio <- as.factor(colData(allen_core)$driver_1_s)
cl <- as.factor(colData(allen_core)$Primary.Type)

detection_rate <- colSums(core>0)
coverage <- colSums(core)

pca <- prcomp(t(log1p(core)))
plot(pca$x, col=cols[bio], pch=19, main="PCA of log-counts, centered not scaled")
legend("bottomleft", levels(bio), fill=cols)
plot(pca$x, col=cols2[cl], pch=19, main="PCA of log-counts, centered not scaled")

df <- data.frame(PC1=pca$x[,1], PC2=pca$x[,2], detection_rate=detection_rate, coverage=coverage)
pairs(df, pch=19, col=cols[bio])

print(cor(df, method="spearman"))
##                        PC1         PC2 detection_rate   coverage
## PC1             1.00000000 -0.01052424      0.8066812  0.4453172
## PC2            -0.01052424  1.00000000     -0.3601716 -0.3286485
## detection_rate  0.80668124 -0.36017158      1.0000000  0.5275845
## coverage        0.44531717 -0.32864852      0.5275845  1.0000000

X cell types, W

Both Vmu and Vpi have intercept, common dispersion is TRUE.

print(system.time(zinb_X <- zinbFit(core, ncores = 3, K = 2,
                                    X=model.matrix(~bio))))
##    user  system elapsed 
## 138.696  31.569  74.610

Plot the results with cells colored according to their biological condition.

plot(zinb_X@W, col=cols[bio], pch=19, xlab="W1", ylab="W2", main="Both V intercepts")
legend("bottomright", levels(bio), fill=cols)

plot(zinb_X@W, col=cols2[cl], pch=19, xlab="W1", ylab="W2", main="Both V intercepts")
legend("topleft", levels(cl), fill=cols2)

Explore Gamma estimates

One interpretation of the model is that \(\gamma_mu\) will act as a “size factor”. Here, we check whether it captures sequencing depth and/or detection rate.

#total number of detected genes in the cell
df <- data.frame(W1=zinb_X@W[,1],
                 W2=zinb_X@W[,2],
                 gamma_mu = zinb_X@gamma_mu[1,],
                 gamma_pi = zinb_X@gamma_pi[1,], 
                 detection_rate=detection_rate, 
                 coverage=coverage)
pairs(df, pch=19, col=cols[bio])

print(cor(df, method="spearman"))
##                         W1          W2    gamma_mu    gamma_pi
## W1              1.00000000  0.06243371 -0.14284292  0.03336945
## W2              0.06243371  1.00000000  0.04833728  0.22845102
## gamma_mu       -0.14284292  0.04833728  1.00000000 -0.25597360
## gamma_pi        0.03336945  0.22845102 -0.25597360  1.00000000
## detection_rate -0.04056153 -0.26312871  0.29942803 -0.99261966
## coverage        0.04966279 -0.07480962  0.87261478 -0.49206198
##                detection_rate    coverage
## W1                -0.04056153  0.04966279
## W2                -0.26312871 -0.07480962
## gamma_mu           0.29942803  0.87261478
## gamma_pi          -0.99261966 -0.49206198
## detection_rate     1.00000000  0.52758451
## coverage           0.52758451  1.00000000

\(\gamma_pi\) clearly captures the detection rate and \(\gamma_mu\) clearly captures the coverage. Interestingly, the two estimates are positively correlated.

par(mfrow=c(1, 2))
plot(rowMeans(getPi(zinb_X)), detection_rate, xlab="Average estimated Pi", ylab="detection rate", pch=19, col=cols[bio])

plot(rowMeans(log1p(getMu(zinb_X))), coverage, xlab="Average estimated log Mu", ylab="coverage", pch=19, col=cols[bio])

It is worth looking at the other estimates as well, namely, \(\beta\) and \(\alpha\).

par(mfrow=c(3, 2))
boxplot(zinb_X@beta_mu[1,], main="Beta_Mu")
abline(h=0)
boxplot(zinb_X@beta_pi[1,], main="Beta_Pi")
abline(h=0)
boxplot(zinb_X@beta_mu[2,], main="Beta_Mu")
abline(h=0)
boxplot(zinb_X@beta_pi[2,], main="Beta_Pi")
abline(h=0)
boxplot(zinb_X@beta_mu[3,], main="Beta_Mu")
abline(h=0)
boxplot(zinb_X@beta_pi[3,], main="Beta_Pi")
abline(h=0)

par(mfrow=c(2, 2))
plot(t(zinb_X@alpha_mu), xlab="Alpha_Mu 1", ylab="Alpha_Mu 2", main="Alpha_Mu")
plot(t(zinb_X@alpha_pi), xlab="Alpha_Pi 1", ylab="Alpha_Pi 2", main="Alpha_Pi")

Highest beta and alpha

beta_1 : Rbp4 VS (layer5) Ntsr1 (layer6)
beta_2 : Scnn1a (layer4) VS Ntsr1 (layer6)

get_DE <- function(zinb_res, core, beta = T, mu = T, component = 2, n=10){
  if (beta & mu){
    mat = zinb_X@beta_mu[component, ]
  }else if (beta & !mu){
    mat = zinb_X@beta_pi[component, ]
  }else if (!beta & mu){
    mat = zinb_X@alpha_mu[component, ]
  }else{
    mat = zinb_X@alpha_pi[component, ]
  }
  idx = order(-abs(mat))[1:n]
  de = mat[idx]
  names(de) = rownames(core)[idx]
  de
}
#beta_mu
get_DE(zinb_res, core, beta = T, mu = T, 2)
## Serpine2  Fam19a1 Cacna2d3  Slc30a3   Ccdc80   Fam84a  Slc17a6   Atp1b2 
## 7.711999 7.357277 7.212348 6.113664 5.546204 5.536171 5.426343 5.122326 
##     Car4     Rorb 
## 4.665507 4.497371
get_DE(zinb_res, core, beta = T, mu = T, 3)
##   Slc17a6  Cacna2d3    Thsd7a      Car4   Fam19a1    Bcl11b    Ccdc80 
##  8.119490  7.965803  7.856950  7.816833  7.737371 -7.615028  7.512144 
##      Rorb    Sema3e   Slc30a3 
##  7.373614 -7.031342  6.985067
#beta_pi
get_DE(zinb_res, core, beta = T, mu = F, 2)
##    Galntl6      Dcaf7      Gosr2       Nfyb     Pik3c3     Lmbrd2 
## -151.57620 -136.97541  -85.62298  -65.35149  -61.08863  -57.04164 
##     Pcyox1     Scamp4      Ntrk3     Rmnd5a 
##  -52.91448  -51.33979  -42.73209  -39.72687
get_DE(zinb_res, core, beta = T, mu = F, 3)
##      Dcaf7      Gosr2    Slitrk1      Ntrk3       Rorb    Rasgrf1 
## -212.67077 -170.21445  -88.63791  -81.37480  -63.12386  -39.03657 
##     Lmbrd2      Pamr1       Cd34     Vstm2a 
##   24.23487  -22.22490  -20.64954  -18.19185
#alpha_mu
get_DE(zinb_res, core, beta = F, mu = T, 1)
##     Ccdc80    Slc17a6   Arhgap25      Meis2     Thsd7a       Rorb 
##  1.1700610  1.1575579  1.0116313 -0.9567975  0.9203791  0.9109857 
##      Pamr1     Deptor       Cux2      Wipf3 
##  0.8686582  0.8608759  0.8037590  0.7367527
get_DE(zinb_res, core, beta = F, mu = T, 2)
##       Bmp3    Pacsin2     Thsd7a       Car4     Sema3e    Fam19a1 
##  1.3832804 -1.1148721 -0.9768006 -0.9447295  0.7614285 -0.6666534 
##      Aldoc    Galntl6      Ptprt   Arhgap25 
##  0.6074339 -0.5768886  0.5483183  0.5410785
#alpha_pi
get_DE(zinb_res, core, beta = F, mu = F, 1)
##      Dcaf7       Nfyb      Gosr2     Lmbrd2    Galntl6       Pomk 
##  42.779152  30.074732  25.635927  22.716046 -20.499934  15.845048 
##      Ntrk3     Rmnd5a     Pcyox1     Thsd7a 
##  15.670698  13.676121  -9.849904   7.121190
get_DE(zinb_res, core, beta = F, mu = F, 2)
##       Nfyb    Slitrk1      Dcaf7      Gosr2     Rmnd5a      Ntrk3 
##  20.569945 -20.218185 -18.382746 -14.286300  11.860824 -11.580074 
##    Galntl6       Rorb       Pomk    Rasgrf1 
## -11.247704   8.540726   7.402234   5.489528

All 1,000 most variable genes, before fit

require(gplots)
# all 1,000 most variable genes, before fit
heatmap.2(log1p(core), ColSideColor = cols[bio], trace = 'none')

All 1,000 most variable genes, after fit

heatmap.2(log1p(t(getMu(zinb_X))), ColSideColor = cols[bio], trace = 'none')

heatmap.2(log1p(t(getMu(zinb_X))), ColSideColor = cols2[cl], trace = 'none')

heatmap.2(t(getPi(zinb_X)), ColSideColor = cols[bio], trace = 'none')

heatmap.2(t(getPi(zinb_X)), ColSideColor = cols2[cl], trace = 'none')

highest beta_mu, after fit

rbp4_ntsr1 = names(get_DE(zinb_res, core, beta = T, mu = T, 2,n=25))
scnn1a_ntsr1 = names(get_DE(zinb_res, core, beta = T, mu = T, 3, n=25))
mu_de = unique(c(rbp4_ntsr1, scnn1a_ntsr1))
length(mu_de)
## [1] 30
fitted_mu = log1p(t(getMu(zinb_X)))
rownames(fitted_mu) = rownames(core)
fitted_mu = fitted_mu[mu_de, ]
heatmap.2(fitted_mu, ColSideColor = cols[bio], trace = 'none')

heatmap.2(fitted_mu, ColSideColor = cols2[cl], trace = 'none')

highest beta_pi, after fit

rbp4_ntsr1 = names(get_DE(zinb_res, core, beta = T, mu = F, 2,n=50))
scnn1a_ntsr1 = names(get_DE(zinb_res, core, beta = T, mu = F, 3, n=50))
pi_de = unique(c(rbp4_ntsr1, scnn1a_ntsr1))
length(pi_de)
## [1] 82
fitted_pi = t(getPi(zinb_X))
rownames(fitted_pi) = rownames(core)
fitted_pi = fitted_pi[pi_de, ]
heatmap.2(fitted_pi, ColSideColor = cols[bio], trace = 'none')

heatmap.2(fitted_pi, ColSideColor = cols2[cl], trace = 'none')

highest alpha_mu, after fit

rbp4_ntsr1 = names(get_DE(zinb_res, core, beta = F, mu = T, 1,n=25))
scnn1a_ntsr1 = names(get_DE(zinb_res, core, beta = F, mu = T, 2, n=25))
mu_de = unique(c(rbp4_ntsr1, scnn1a_ntsr1))
length(mu_de)
## [1] 45
fitted_mu = log1p(t(getMu(zinb_X)))
rownames(fitted_mu) = rownames(core)
fitted_mu = fitted_mu[mu_de, ]
heatmap.2(fitted_mu, ColSideColor = cols[bio], trace = 'none')

heatmap.2(fitted_mu, ColSideColor = cols2[cl], trace = 'none')

highest alpha_pi, after fit

rbp4_ntsr1 = names(get_DE(zinb_res, core, beta = F, mu = F, 1,n=25))
scnn1a_ntsr1 = names(get_DE(zinb_res, core, beta = F, mu = F, 2, n=25))
pi_de = unique(c(rbp4_ntsr1, scnn1a_ntsr1))
length(pi_de)
## [1] 36
fitted_pi = t(getPi(zinb_X))
rownames(fitted_pi) = rownames(core)
fitted_pi = fitted_pi[pi_de, ]
heatmap.2(fitted_pi, ColSideColor = cols[bio], trace = 'none')

heatmap.2(fitted_pi, ColSideColor = cols2[cl], trace = 'none')

Goodness of fit

myPlotMD <- function(x, y, main= '', xlab = 'Mean', ylab = 'Difference'){
  mean = .5*(x + y)
  diff = x - y
  smoothScatter(mean, diff, xlab = xlab, ylab = ylab, main = main)
  abline(h = 0, col = 'red', lwd = 2)
}

myPlotMD(log2(rowMeans(t(getMu(zinb_X)))), log2(rowMeans(core)),
         main = 'estimated vs observed mean count')

estimated = unlist(t(getPi(zinb_X)))
observed = rep(rowMeans(core > 0), each = ncol(core))
myPlotMD(estimated, observed, main = 'estimated vs observed zero probability')

mu = t(getMu(zinb_X))
mu6 = mu[, as.vector(bio) == 'Ntsr1-Cre_GN220']
mu5 = mu[, as.vector(bio) == 'Rbp4-Cre_KL100']
mu6 = log2(rowMeans(mu6))
mu5 = log2(rowMeans(mu5))
myPlotMD(mu6, mu5, main="MD L6 vs L5, estimated mu")
muDE = names(get_DE(zinb_res, core, beta = T, mu = T, 2, n = 25))
muIdx = rownames(core) %in% muDE
points(.5*(mu6 + mu5)[muIdx], (mu6 - mu5)[muIdx], col="cyan", pch=1, lwd=2)
piDE = names(get_DE(zinb_res, core, beta = T, mu = F, 2, n = 25))
piIdx = rownames(core) %in% piDE
points(.5*(mu6 + mu5)[piIdx], (mu6 - mu5)[piIdx], col="magenta", pch=3, lwd=2)

pi = t(getPi(zinb_X))
pi6 = pi[, as.vector(bio) == 'Ntsr1-Cre_GN220']
pi5 = pi[, as.vector(bio) == 'Rbp4-Cre_KL100']
pi6 = rowMeans(pi6)
pi5 = rowMeans(pi5)
myPlotMD(pi6, pi5, main = "MD L5 vs L4, estimated pi")
points(.5*(pi6 + pi5)[muIdx], (pi6 - pi5)[muIdx], col="cyan", pch=1, lwd=2)
points(.5*(pi6 + pi5)[piIdx], (pi6 - pi5)[piIdx], col="magenta", pch=3, lwd=2)

X cell types, no W

Both Vmu and Vpi have intercept, common dispersion is TRUE.

print(system.time(zinb_X <- zinbFit(core, ncores = 3, K = 0,
                                    X=model.matrix(~bio))))
##    user  system elapsed 
## 120.094  20.100  64.835
par(mfrow=c(1, 2))
plot(rowMeans(getPi(zinb_X)), detection_rate, xlab="Average estimated Pi", ylab="detection rate", pch=19, col=cols[bio])

plot(rowMeans(log1p(getMu(zinb_X))), coverage, xlab="Average estimated log Mu", ylab="coverage", pch=19, col=cols[bio])

It is worth looking at the other estimates as well, namely, \(\beta\) and \(\alpha\).

par(mfrow=c(3, 2))
boxplot(zinb_X@beta_mu[1,], main="Beta_Mu")
abline(h=0)
boxplot(zinb_X@beta_pi[1,], main="Beta_Pi")
abline(h=0)
boxplot(zinb_X@beta_mu[2,], main="Beta_Mu")
abline(h=0)
boxplot(zinb_X@beta_pi[2,], main="Beta_Pi")
abline(h=0)
boxplot(zinb_X@beta_mu[3,], main="Beta_Mu")
abline(h=0)
boxplot(zinb_X@beta_pi[3,], main="Beta_Pi")
abline(h=0)

Highest beta and alpha

beta_1 : Rbp4 (layer5) VS Ntsr1 (layer6)
beta_2 : Scnn1a (layer4) VS Ntsr1 (layer6)

#beta_mu
get_DE(zinb_res, core, beta = T, mu = T, 2)
##   Ccdc80  Fam19a1 Serpine2  Slc17a6 Cacna2d3     Car4   Thsd7a  Slc30a3 
## 8.348349 8.010856 7.900178 7.384489 7.099613 6.981929 6.816104 6.290389 
##   Fam84a     Rorb 
## 6.140796 5.662795
get_DE(zinb_res, core, beta = T, mu = T, 3)
##      Car4    Ccdc80  Cacna2d3    Thsd7a   Fam19a1   Slc17a6   Slc30a3 
##  8.158510  7.926219  7.858760  7.634242  7.527618  7.464290  7.035235 
##    Bcl11b      Rorb  Serpine2 
## -6.960218  6.860313  6.484142
#beta_pi
get_DE(zinb_res, core, beta = T, mu = F, 2)
##     Plcxd2    Rasgrf1   Cacna2d3     Tmem91        Ptn     Tmem35 
## -13.703292 -12.392111 -12.332379 -11.890247 -11.265895 -11.061337 
##      Foxp2   Tmem178b       Rprm       Tle4 
##  10.760143 -10.535216  10.123317   9.603131
get_DE(zinb_res, core, beta = T, mu = F, 3)
##    Atp1b2      Coch    Plcxd2      Nrep    Brinp3    Camk2d       Ptn 
## -14.06312 -13.28861 -13.26773 -12.83176 -12.63725 -12.40054 -11.94713 
##     Unc5d  Tmem178b     Rims3 
## -11.86952 -11.64822 -11.64277

All 1,000 most variable genes, after fit

heatmap.2(log1p(t(getMu(zinb_X))), ColSideColor = cols[bio], trace = 'none')

heatmap.2(log1p(t(getMu(zinb_X))), ColSideColor = cols2[cl], trace = 'none')

heatmap.2(t(getPi(zinb_X)), ColSideColor = cols[bio], trace = 'none')

heatmap.2(t(getPi(zinb_X)), ColSideColor = cols2[cl], trace = 'none')

highest beta_mu, after fit

rbp4_ntsr1 = names(get_DE(zinb_res, core, beta = T, mu = T, 2,n=25))
scnn1a_ntsr1 = names(get_DE(zinb_res, core, beta = T, mu = T, 3, n=25))
mu_de = unique(c(rbp4_ntsr1, scnn1a_ntsr1))
length(mu_de)
## [1] 30
fitted_mu = log1p(t(getMu(zinb_X)))
rownames(fitted_mu) = rownames(core)
fitted_mu = fitted_mu[mu_de, ]
heatmap.2(fitted_mu, ColSideColor = cols[bio], trace = 'none')

heatmap.2(fitted_mu, ColSideColor = cols2[cl], trace = 'none')

highest beta_pi, after fit

rbp4_ntsr1 = names(get_DE(zinb_res, core, beta = T, mu = F, 2,n=25))
scnn1a_ntsr1 = names(get_DE(zinb_res, core, beta = T, mu = F, 3, n=25))
pi_de = unique(c(rbp4_ntsr1, scnn1a_ntsr1))
length(pi_de)
## [1] 42
fitted_pi = t(getPi(zinb_X))
rownames(fitted_pi) = rownames(core)
fitted_pi = fitted_pi[pi_de, ]
heatmap.2(fitted_pi, ColSideColor = cols[bio], trace = 'none')

heatmap.2(fitted_pi, ColSideColor = cols2[cl], trace = 'none')

Goodness of fit

myPlotMD(log2(rowMeans(t(getMu(zinb_X)))), log2(rowMeans(core)),
         main = 'estimated vs observed mean count')

estimated = unlist(t(getPi(zinb_X)))
observed = rep(rowMeans(core > 0), each = ncol(core))
myPlotMD(estimated, observed, main = 'estimated vs observed zero probability')

mu = t(getMu(zinb_X))
mu6 = mu[, as.vector(bio) == 'Ntsr1-Cre_GN220']
mu5 = mu[, as.vector(bio) == 'Rbp4-Cre_KL100']
mu6 = log2(rowMeans(mu6))
mu5 = log2(rowMeans(mu5))
myPlotMD(mu6, mu5, main="MD L6 vs L5, estimated mu")
muDE = names(get_DE(zinb_res, core, beta = T, mu = T, 2, n = 25))
muIdx = rownames(core) %in% muDE
points(.5*(mu6 + mu5)[muIdx], (mu6 - mu5)[muIdx], col="cyan", pch=1, lwd=2)
piDE = names(get_DE(zinb_res, core, beta = T, mu = F, 2, n = 25))
piIdx = rownames(core) %in% piDE
points(.5*(mu6 + mu5)[piIdx], (mu6 - mu5)[piIdx], col="magenta", pch=3, lwd=2)

pi = t(getPi(zinb_X))
pi6 = pi[, as.vector(bio) == 'Ntsr1-Cre_GN220']
pi5 = pi[, as.vector(bio) == 'Rbp4-Cre_KL100']
pi6 = rowMeans(pi6)
pi5 = rowMeans(pi5)
myPlotMD(pi6, pi5, main = "MD L5 vs L4, estimated pi")
points(.5*(pi6 + pi5)[muIdx], (pi6 - pi5)[muIdx], col="cyan", pch=1, lwd=2)
points(.5*(pi6 + pi5)[piIdx], (pi6 - pi5)[piIdx], col="magenta", pch=3, lwd=2)